Trained a doc2vec embedding based on skip-gram on the politician level i.e. all tweets of one politician are treated as a document.
\(\Rightarrow\) One obtains a 300 dimensional embedding vector for each politician.
# load necessary functions and embeddings
load(paste0(here::here(), "/doc2vec_graph/politician_level_dco2vec.RData"))
load(paste0(here::here(), "/doc2vec_graph/word_embedding_utils.RData"))
load(paste0(here::here(), "/doc2vec_graph/doc_clust_utils.RData"))
load(paste0(here::here(), "/doc2vec_graph/graph_metrics.RData"))
library(tidyverse)
library(visNetwork)
library(tidygraph)
library(ggraph)
library(patchwork)
# precompute some important quantities
pairwise_doc_sim <- calc_pairwise_doc_sim(document_embedding)
edges <- data.frame(from = pairwise_doc_sim[, 1],
to = pairwise_doc_sim[, 2],
value = pairwise_doc_sim[, 3],
title = paste0(round(pairwise_doc_sim[, 3], 5)))
knn_edges <- create_knn_edges(
pairwise_doc_similarity = pairwise_doc_sim,
nodes = nodes,
k = 5
)
# UWOT
umap_doc_emb <- uwot::umap(
X = document_embedding,
n_components = 3,
approx_pow = TRUE,
n_epochs = 20
)
colnames(umap_doc_emb) <- c("x", "y", "z")
umap_doc_emb %>%
as_tibble() %>%
bind_cols(nodes) %>%
plotly::plot_ly(x = ~x, y = ~y, z = ~z, color = ~group,
colors = c("blue", "black", "darkgrey",
"violet", "orange",
"green", "red"),
marker = list(symbol = "circle",
size = 3),
text = ~label) %>%
plotly::add_markers(opacity = 1) %>%
plotly::layout(
legend = list(itemsizing = "constant", font = list(size = 15)),
scene = list(
xaxis = list(title = ""),
yaxis = list(title = ""),
zaxis = list(title = "")
),
title = "Umap dimensionality reduction of politician embedding"
)
Apply hierarchical clustering \(\rightarrow\) analyze the dendrograms.
cluster_size <- nodes %>%
group_by(group) %>%
summarise(n_cluster = ceiling(n() * 0.3))
afd_clust <- doc_emb_clustering(
document_embedding = document_embedding,
nodes = nodes,
n_cluster = cluster_size$n_cluster[cluster_size$group == "AFD"],
party = "AFD",
branch_width = 2
)
fdp_clust <- doc_emb_clustering(
document_embedding = document_embedding,
nodes = nodes,
n_cluster = cluster_size$n_cluster[cluster_size$group == "FDP"],
party = "FDP",
branch_width = 2
)
cdu_clust <- doc_emb_clustering(
document_embedding = document_embedding,
nodes = nodes,
n_cluster = cluster_size$n_cluster[cluster_size$group == "CDU"],
party = "CDU",
branch_width = 2
)
csu_clust <- doc_emb_clustering(
document_embedding = document_embedding,
nodes = nodes,
n_cluster = cluster_size$n_cluster[cluster_size$group == "CSU"],
party = "CSU",
branch_width = 2
)
linke_clust <- doc_emb_clustering(
document_embedding = document_embedding,
nodes = nodes,
n_cluster = cluster_size$n_cluster[cluster_size$group == "Die Linke"],
party = "Die Linke",
branch_width = 2
)
gruene_clust <- doc_emb_clustering(
document_embedding = document_embedding,
nodes = nodes,
n_cluster = cluster_size$n_cluster[cluster_size$group == "Grüne"],
party = "Grüne",
branch_width = 2
)
spd_clust <- doc_emb_clustering(
document_embedding = document_embedding,
nodes = nodes,
n_cluster = cluster_size$n_cluster[cluster_size$group == "SPD"],
party = "SPD",
branch_width = 2
)
gruene_clust$dendro
umap_cluster_plot(
document_embedding = document_embedding,
nodes = nodes,
cluster_assignments = gruene_clust$cluster_assignments,
party = "Grüne"
)
spd_clust$dendro
umap_cluster_plot(
document_embedding = document_embedding,
nodes = nodes,
cluster_assignments = spd_clust$cluster_assignments,
party = "SPD"
)
Calculate pairwise similarities within the document embedding. The resulting similarities can be displayed as a undirected similarity graph with
This very dense graph can be represented either in a
The graph for each party can be displayed vizually along the cluster assignments that were computed on the document embedding.
Here the 5-NN Graph is used.
party_graph_docclust(
party_char = "AFD",
cluster_assignments = afd_clust$cluster_assignments,
nodes = nodes,
edges = knn_edges
)
Here the 5-NN Graph is used.
party_graph_docclust(
party_char = "SPD",
cluster_assignments = spd_clust$cluster_assignments,
nodes = nodes,
edges = knn_edges
)
Having these similarity graphs one can use common graph metrics to assess heterogeneity within parties and compare these metrics.
Here the neighborhood graph is used.
metrics_neighboring_graph_0.001$plot
metrics_neighboring_graph_0.001$hop_plot